{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5f53531a",
   "metadata": {},
   "source": [
    "### 加载数据并转换为DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "47222493",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_breast_cancer\n",
    "breast_cancer = load_breast_cancer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "aa152f0c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = breast_cancer['data']\n",
    "target = breast_cancer['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ff4380e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((569, 30), (569,))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape, target.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12531ea5",
   "metadata": {},
   "source": [
    "- 合并为一个ndarray数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "9d458669",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "90f41178",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 4.601e-01, 1.189e-01,\n",
       "         0.000e+00],\n",
       "        [2.057e+01, 1.777e+01, 1.329e+02, ..., 2.750e-01, 8.902e-02,\n",
       "         0.000e+00],\n",
       "        [1.969e+01, 2.125e+01, 1.300e+02, ..., 3.613e-01, 8.758e-02,\n",
       "         0.000e+00],\n",
       "        ...,\n",
       "        [1.660e+01, 2.808e+01, 1.083e+02, ..., 2.218e-01, 7.820e-02,\n",
       "         0.000e+00],\n",
       "        [2.060e+01, 2.933e+01, 1.401e+02, ..., 4.087e-01, 1.240e-01,\n",
       "         0.000e+00],\n",
       "        [7.760e+00, 2.454e+01, 4.792e+01, ..., 2.871e-01, 7.039e-02,\n",
       "         1.000e+00]]),\n",
       " (569, 31))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_datas = np.c_[data, target]\n",
    "all_datas, all_datas.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e1ed376",
   "metadata": {},
   "source": [
    "- 转换为DataFrame数据并给出列名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "67a8cb4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8aea65b9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean radius</th>\n",
       "      <th>mean texture</th>\n",
       "      <th>mean perimeter</th>\n",
       "      <th>mean area</th>\n",
       "      <th>mean smoothness</th>\n",
       "      <th>mean compactness</th>\n",
       "      <th>mean concavity</th>\n",
       "      <th>mean concave points</th>\n",
       "      <th>mean symmetry</th>\n",
       "      <th>mean fractal dimension</th>\n",
       "      <th>...</th>\n",
       "      <th>worst texture</th>\n",
       "      <th>worst perimeter</th>\n",
       "      <th>worst area</th>\n",
       "      <th>worst smoothness</th>\n",
       "      <th>worst compactness</th>\n",
       "      <th>worst concavity</th>\n",
       "      <th>worst concave points</th>\n",
       "      <th>worst symmetry</th>\n",
       "      <th>worst fractal dimension</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.30010</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>0.2419</td>\n",
       "      <td>0.07871</td>\n",
       "      <td>...</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.16220</td>\n",
       "      <td>0.66560</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.08690</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>0.1812</td>\n",
       "      <td>0.05667</td>\n",
       "      <td>...</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.12380</td>\n",
       "      <td>0.18660</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.19740</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>0.2069</td>\n",
       "      <td>0.05999</td>\n",
       "      <td>...</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.14440</td>\n",
       "      <td>0.42450</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.24140</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>0.2597</td>\n",
       "      <td>0.09744</td>\n",
       "      <td>...</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.20980</td>\n",
       "      <td>0.86630</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.19800</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>0.1809</td>\n",
       "      <td>0.05883</td>\n",
       "      <td>...</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.13740</td>\n",
       "      <td>0.20500</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>564</th>\n",
       "      <td>21.56</td>\n",
       "      <td>22.39</td>\n",
       "      <td>142.00</td>\n",
       "      <td>1479.0</td>\n",
       "      <td>0.11100</td>\n",
       "      <td>0.11590</td>\n",
       "      <td>0.24390</td>\n",
       "      <td>0.13890</td>\n",
       "      <td>0.1726</td>\n",
       "      <td>0.05623</td>\n",
       "      <td>...</td>\n",
       "      <td>26.40</td>\n",
       "      <td>166.10</td>\n",
       "      <td>2027.0</td>\n",
       "      <td>0.14100</td>\n",
       "      <td>0.21130</td>\n",
       "      <td>0.4107</td>\n",
       "      <td>0.2216</td>\n",
       "      <td>0.2060</td>\n",
       "      <td>0.07115</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>565</th>\n",
       "      <td>20.13</td>\n",
       "      <td>28.25</td>\n",
       "      <td>131.20</td>\n",
       "      <td>1261.0</td>\n",
       "      <td>0.09780</td>\n",
       "      <td>0.10340</td>\n",
       "      <td>0.14400</td>\n",
       "      <td>0.09791</td>\n",
       "      <td>0.1752</td>\n",
       "      <td>0.05533</td>\n",
       "      <td>...</td>\n",
       "      <td>38.25</td>\n",
       "      <td>155.00</td>\n",
       "      <td>1731.0</td>\n",
       "      <td>0.11660</td>\n",
       "      <td>0.19220</td>\n",
       "      <td>0.3215</td>\n",
       "      <td>0.1628</td>\n",
       "      <td>0.2572</td>\n",
       "      <td>0.06637</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>566</th>\n",
       "      <td>16.60</td>\n",
       "      <td>28.08</td>\n",
       "      <td>108.30</td>\n",
       "      <td>858.1</td>\n",
       "      <td>0.08455</td>\n",
       "      <td>0.10230</td>\n",
       "      <td>0.09251</td>\n",
       "      <td>0.05302</td>\n",
       "      <td>0.1590</td>\n",
       "      <td>0.05648</td>\n",
       "      <td>...</td>\n",
       "      <td>34.12</td>\n",
       "      <td>126.70</td>\n",
       "      <td>1124.0</td>\n",
       "      <td>0.11390</td>\n",
       "      <td>0.30940</td>\n",
       "      <td>0.3403</td>\n",
       "      <td>0.1418</td>\n",
       "      <td>0.2218</td>\n",
       "      <td>0.07820</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>567</th>\n",
       "      <td>20.60</td>\n",
       "      <td>29.33</td>\n",
       "      <td>140.10</td>\n",
       "      <td>1265.0</td>\n",
       "      <td>0.11780</td>\n",
       "      <td>0.27700</td>\n",
       "      <td>0.35140</td>\n",
       "      <td>0.15200</td>\n",
       "      <td>0.2397</td>\n",
       "      <td>0.07016</td>\n",
       "      <td>...</td>\n",
       "      <td>39.42</td>\n",
       "      <td>184.60</td>\n",
       "      <td>1821.0</td>\n",
       "      <td>0.16500</td>\n",
       "      <td>0.86810</td>\n",
       "      <td>0.9387</td>\n",
       "      <td>0.2650</td>\n",
       "      <td>0.4087</td>\n",
       "      <td>0.12400</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>568</th>\n",
       "      <td>7.76</td>\n",
       "      <td>24.54</td>\n",
       "      <td>47.92</td>\n",
       "      <td>181.0</td>\n",
       "      <td>0.05263</td>\n",
       "      <td>0.04362</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.1587</td>\n",
       "      <td>0.05884</td>\n",
       "      <td>...</td>\n",
       "      <td>30.37</td>\n",
       "      <td>59.16</td>\n",
       "      <td>268.6</td>\n",
       "      <td>0.08996</td>\n",
       "      <td>0.06444</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0.2871</td>\n",
       "      <td>0.07039</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>569 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \\\n",
       "0          17.99         10.38          122.80     1001.0          0.11840   \n",
       "1          20.57         17.77          132.90     1326.0          0.08474   \n",
       "2          19.69         21.25          130.00     1203.0          0.10960   \n",
       "3          11.42         20.38           77.58      386.1          0.14250   \n",
       "4          20.29         14.34          135.10     1297.0          0.10030   \n",
       "..           ...           ...             ...        ...              ...   \n",
       "564        21.56         22.39          142.00     1479.0          0.11100   \n",
       "565        20.13         28.25          131.20     1261.0          0.09780   \n",
       "566        16.60         28.08          108.30      858.1          0.08455   \n",
       "567        20.60         29.33          140.10     1265.0          0.11780   \n",
       "568         7.76         24.54           47.92      181.0          0.05263   \n",
       "\n",
       "     mean compactness  mean concavity  mean concave points  mean symmetry  \\\n",
       "0             0.27760         0.30010              0.14710         0.2419   \n",
       "1             0.07864         0.08690              0.07017         0.1812   \n",
       "2             0.15990         0.19740              0.12790         0.2069   \n",
       "3             0.28390         0.24140              0.10520         0.2597   \n",
       "4             0.13280         0.19800              0.10430         0.1809   \n",
       "..                ...             ...                  ...            ...   \n",
       "564           0.11590         0.24390              0.13890         0.1726   \n",
       "565           0.10340         0.14400              0.09791         0.1752   \n",
       "566           0.10230         0.09251              0.05302         0.1590   \n",
       "567           0.27700         0.35140              0.15200         0.2397   \n",
       "568           0.04362         0.00000              0.00000         0.1587   \n",
       "\n",
       "     mean fractal dimension  ...  worst texture  worst perimeter  worst area  \\\n",
       "0                   0.07871  ...          17.33           184.60      2019.0   \n",
       "1                   0.05667  ...          23.41           158.80      1956.0   \n",
       "2                   0.05999  ...          25.53           152.50      1709.0   \n",
       "3                   0.09744  ...          26.50            98.87       567.7   \n",
       "4                   0.05883  ...          16.67           152.20      1575.0   \n",
       "..                      ...  ...            ...              ...         ...   \n",
       "564                 0.05623  ...          26.40           166.10      2027.0   \n",
       "565                 0.05533  ...          38.25           155.00      1731.0   \n",
       "566                 0.05648  ...          34.12           126.70      1124.0   \n",
       "567                 0.07016  ...          39.42           184.60      1821.0   \n",
       "568                 0.05884  ...          30.37            59.16       268.6   \n",
       "\n",
       "     worst smoothness  worst compactness  worst concavity  \\\n",
       "0             0.16220            0.66560           0.7119   \n",
       "1             0.12380            0.18660           0.2416   \n",
       "2             0.14440            0.42450           0.4504   \n",
       "3             0.20980            0.86630           0.6869   \n",
       "4             0.13740            0.20500           0.4000   \n",
       "..                ...                ...              ...   \n",
       "564           0.14100            0.21130           0.4107   \n",
       "565           0.11660            0.19220           0.3215   \n",
       "566           0.11390            0.30940           0.3403   \n",
       "567           0.16500            0.86810           0.9387   \n",
       "568           0.08996            0.06444           0.0000   \n",
       "\n",
       "     worst concave points  worst symmetry  worst fractal dimension  target  \n",
       "0                  0.2654          0.4601                  0.11890     0.0  \n",
       "1                  0.1860          0.2750                  0.08902     0.0  \n",
       "2                  0.2430          0.3613                  0.08758     0.0  \n",
       "3                  0.2575          0.6638                  0.17300     0.0  \n",
       "4                  0.1625          0.2364                  0.07678     0.0  \n",
       "..                    ...             ...                      ...     ...  \n",
       "564                0.2216          0.2060                  0.07115     0.0  \n",
       "565                0.1628          0.2572                  0.06637     0.0  \n",
       "566                0.1418          0.2218                  0.07820     0.0  \n",
       "567                0.2650          0.4087                  0.12400     0.0  \n",
       "568                0.0000          0.2871                  0.07039     1.0  \n",
       "\n",
       "[569 rows x 31 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    all_datas,\n",
    "    columns=list(breast_cancer['feature_names']) + ['target']\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2c9d79d",
   "metadata": {},
   "source": [
    "### 查看数据集数据分布"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4c3a6f3",
   "metadata": {},
   "source": [
    "- 拆分数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "5d6ab703",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "85e28a2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置随机数种子，更好验证stratify参数的作用\n",
    "data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.25, random_state=40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "75cd9343",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((426, 30), (143, 30), (426,), (143,))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_train.shape, data_test.shape, target_train.shape, target_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "acf95cbe",
   "metadata": {},
   "source": [
    "- 查看训练数据和测试数据中target的数据分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "87187201",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.Series(target).unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "5c580bdd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "target\n",
      "1    0.627417\n",
      "0    0.372583\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "target_train\n",
      "1    0.607981\n",
      "0    0.392019\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "target_test\n",
      "1    0.685315\n",
      "0    0.314685\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "for name, array in zip(['target', 'target_train', 'target_test'], [target, target_train, target_test]):\n",
    "    print()\n",
    "    print(name)\n",
    "    print(pd.Series(array).value_counts(normalize=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "285831a0",
   "metadata": {},
   "source": [
    "### 训练集和测试集数据的均匀拆分"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "e3cb7a2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stratify  拆分标准\n",
    "data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.25, random_state=40, stratify=target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "567b1872",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "target\n",
      "1    0.627417\n",
      "0    0.372583\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "target_train\n",
      "1    0.626761\n",
      "0    0.373239\n",
      "Name: proportion, dtype: float64\n",
      "\n",
      "target_test\n",
      "1    0.629371\n",
      "0    0.370629\n",
      "Name: proportion, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "for name, array in zip(['target', 'target_train', 'target_test'], [target, target_train, target_test]):\n",
    "    print()\n",
    "    print(name)\n",
    "    print(pd.Series(array).value_counts(normalize=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba86be66",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
